# -*- coding:utf-8 -*-
from __future__ import division
from sklearn.cluster import KMeans
import util.word_seg_util as wordsutil

__author__ = 'shudongma.msd(风骐)'



# Format the dataset.
# datalist has the form [[feature words of that record], ...]
def format_dataset(datalist):
    """Encode every record as a 0/1 presence vector over the vocabulary.

    The vocabulary is built on each call, in this order: the items of every
    value in ``wordsutil.stock_mood_dict`` (dict iteration order), followed by
    the whitespace-separated tokens of ``wordsutil.corpus``.  Duplicates are
    kept, so a word appearing twice in the vocabulary yields two columns.

    Args:
        datalist: iterable of records; each record is tested with ``in``
            against every vocabulary word (documented above as a list of
            feature words).

    Returns:
        A list with one inner list per record, in input order.  Each inner
        list has one entry per vocabulary word: 1 if the word is in the
        record, otherwise 0.
    """
    vocabulary = [
        word
        for group in wordsutil.stock_mood_dict.values()
        for word in group
    ]
    vocabulary += wordsutil.corpus.split()

    return [
        [1 if word in record else 0 for word in vocabulary]
        for record in datalist
    ]


def kMean_cluster(datalist, n_clusters=3):
    """Cluster the records with k-means and return the predicted labels.

    The records are first encoded by ``format_dataset`` and the resulting
    matrix is passed to ``KMeans.fit_predict``.

    Args:
        datalist: records in the form accepted by ``format_dataset``.
        n_clusters: number of clusters requested from ``KMeans``.  Defaults
            to 3, the value that was previously hard-coded, so existing
            callers are unaffected.

    Returns:
        The result of ``KMeans.fit_predict`` on the encoded matrix.
    """
    dataMat = format_dataset(datalist)
    clf = KMeans(n_clusters=n_clusters)
    return clf.fit_predict(dataMat)